Purpose

This document summarizes Rick Gilmore’s analysis of the Jaccard index data.

The goal is to explore ways to generate an empirical “null” distribution of the Jaccard index data to compare it to the observed data.

Set-up

Note: I have set eval=FALSE for a series of chunks below that generate the permuted sorting and Jaccard index data. These take many minutes to run.

Analysis plan

  1. Start with each participant’s raw sorting data within each wallpaper group.
  2. Permute the exemplars each participant sorted into similar piles by randomizing the mapping between the actual exemplar and the permuted exemplar id.
  1. Recalculate the Jaccard indices using the analysis/make.jaccard.df.R function.
  2. Do 1-3 n times, where n is large, probably 1,000.
  3. Compare the mean Jaccard indices (by wallpaper group) from the permuted set to the values we observed empirically.

Preliminary work

Let’s build and test a permutation function for the raw sorting data.

Now, let’s generate multiple permuted CSVs.

generate_n_sorting_permutations <-
  function(wp_group = "P1",
           n_permutations = 5) {
    csv_in <- paste0("analysis/data/", wp_group, "-sorting.csv")
    if (!file.exists(csv_in)) {
      stop(paste0("`csv_in` not found: ", csv_in))
    }
    
    df_in <- readr::read_csv(csv_in)
    
    df_exemplars <- df_in[, -c(1, 2, 23, 24)]
    out_m <- as.matrix(df_exemplars)
    for (p in 1:n_permutations) {
      csv_out <-
        paste0(
          "analysis/data/permutation_analysis/sorting_csv/",
          wp_group,
          "-sorting-perm-",
          stringr::str_pad(p, 3, pad = 0), ".csv"
        )
      
      for (r in 1:dim(out_m)[1]) {
        new_i <- sample(1:20)
        out_m[r, 1:20] <- out_m[r, new_i]
      }
      
      array_out <-
        as.data.frame(cbind(df_in$Participant, df_in$Set, out_m, df_in$Set_size, df_in$Group))
      
      # Rename!
      names(array_out) <-
        c("Participant",
          "Set",
          names(df_exemplars),
          "Set_size",
          "Group")
      array_out
    
      readr::write_csv(array_out, csv_out)
    }
  }

Then we test it.

generate_n_sorting_permutations()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   Participant = col_character(),
##   Group = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

Now, let’s confirm that we can calculate Jaccard indices from these data.

make_jaccard_csvs <- function(wallpaper_group = "P1",
                              duplicates = FALSE,
                              input_dir = 'analysis/data/permutation_analysis/sorting_csv/',
                              output_dir = 'analysis/data/permutation_analysis/jaccards/') {
  # Makes a data.frame from the raw sorting data
  
  # Load externals
  source("analysis/jaccard.data.R")
  source("analysis/jaccard.R")
  
  these_csvs <-
    list.files(input_dir, paste0("^", wallpaper_group, "\\-"), full.names = TRUE)
  purrr::map(these_csvs,
             calculate_save_jaccard_df,
             wallpaper_group,
             jaccard_dir = output_dir)
}

# Load a sorting permutation file, calculate the Jaccard indices, and (conditionally) save it to file.
calculate_save_jaccard_df <- function(this_csv,
                                      wallpaper_group,
                                      save_output = TRUE,
                                      jaccard_dir = "analysis/data/permutation_analysis/jaccards/",
                                      vb = FALSE) {
  
  this_fn <- basename(this_csv)
  this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
  out_fn <-
    paste0(jaccard_dir,
           wallpaper_group,
           "-jaccard-",
           this_perm_number,
           ".csv")
  
  this_df <- readr::read_csv(this_csv)
  
  # Calculate Jaccard
  jaccard_df <- jaccard.data(this_df)
  
  if (save_output) {
    if (vb) message(paste0('Saving ', out_fn))
    readr::write_csv(jaccard_df, out_fn)
  } else {
    jaccard_df
  }
}
make_jaccard_csvs()

Generate data

P1

generate_n_sorting_permutations("P1", n_permutations = 999)
make_jaccard_csvs("P1")

P31M

generate_n_sorting_permutations("P31M", n_permutations = 999)
make_jaccard_csvs("P31M")

P3M1

generate_n_sorting_permutations("P3M1", n_permutations = 999)
make_jaccard_csvs("P3M1")

P6

generate_n_sorting_permutations("P6", n_permutations = 999)
make_jaccard_csvs("P6")

P6M

generate_n_sorting_permutations("P6M", n_permutations = 999)
make_jaccard_csvs("P6M")

Analyze simulated results

Create helper functions

make_perm_jaccard_df <- function(this_csv) {
  this_fn <- basename(this_csv)
  this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
  this_df <- readr::read_csv(this_csv)
  
  this_df <- this_df %>%
    dplyr::mutate(
      .,
      exemplar_pair = paste0(
        stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
        "-",
        stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
        ),
        perm = this_perm_number
    )
  
  this_df
}
make_aggregate_perm_jaccard_df <- function(wp_group = "P1",
                                           input_dir = "analysis/data/permutation_analysis/jaccards",
                                           save_csv = TRUE,
                                           output_dir = "analysis/data/permutation_analysis/aggregates",
                                           vb = TRUE) {
  these_csvs <-
    list.files(input_dir, paste0("^", wp_group, "\\-"), full.names = TRUE)
  df <- purrr::map_df(these_csvs, make_perm_jaccard_df)
  
  if (save_csv) {
    out_fn <- file.path(output_dir, paste0(wp_group, "-aggregate-perm-jaccard.csv"))
    if (vb) message(paste0("Saving ", out_fn))
    readr::write_csv(df, out_fn)
  } else {
    df
  }
}

Empirical distributions of Jaccard indices by group and exemplar-pair

P1

Import the data.

P1_perm_df <- make_aggregate_perm_jaccard_df("P1")
P1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P1-aggregate-perm-jaccard.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

P1_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P1_perm_stats_df <- P1_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P31M

Import the data.

P31M_perm_df <- make_aggregate_perm_jaccard_df("P31M")
P31M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P31M-aggregate-perm-jaccard.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

P31M_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P31M_perm_stats_df <- P31M_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P3M1

Import the data.

P3M1_perm_df <- make_aggregate_perm_jaccard_df("P3M1")
P3M1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P3M1-aggregate-perm-jaccard.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

P3M1_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P3M1_perm_stats_df <- P3M1_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P6

Import the data.

P6_perm_df <- make_aggregate_perm_jaccard_df("P6")
P6_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6-aggregate-perm-jaccard.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

P6_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P6_perm_stats_df <- P6_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

P6M

Import the data.

P6M_perm_df <- make_aggregate_perm_jaccard_df("P6M")
P6M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6M-aggregate-perm-jaccard.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character(),
##   exemplar_pair = col_character(),
##   perm = col_character()
## )

Visualize.

P6M_perm_df %>%
  ggplot2::ggplot(.) +
  ggplot2::aes(x = Jaccard) +
  ggplot2::geom_histogram(bins = 50)

Generate summary stats by exemplar pair.

P6M_perm_stats_df <- P6M_perm_df %>%
  dplyr::group_by(., Group, exemplar_pair) %>%
  dplyr::summarize(., jaccard_mean = mean(Jaccard),
                   jaccard_sd = sd(Jaccard))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.

Aggregating across groups

jaccard_perm_df <- rbind(P1_perm_df, P31M_perm_df, P3M1_perm_df, P6_perm_df, P6M_perm_df)

Visualization.

jaccard_perm_df %>%
  ggplot(.) +
  aes(Jaccard, color = Group) +
  facet_grid(Group ~ .) +
  geom_boxplot(bins = 50)
## Warning: Ignoring unknown parameters: bins

jaccard_perm_df %>%
  ggplot(.) +
  aes(Jaccard, color = Group) +
  facet_grid(Group ~ .) +
  geom_boxplot(bins = 50)
## Warning: Ignoring unknown parameters: bins

jaccard_perm_df %>%
  ggplot(.) +
  aes(x = Group, y = Jaccard) +
  geom_violin()

These plots show that the mean differences in Jaccard indices are reflected in the participants’ data are shown in the permuted data, too. This makes sense since the participants detected regularities and sorted the exemplars into different numbers of sets. In permuting the exemplar identifiers within participants, we keep some of this structure.

Let’s try aggregating the by-exemplar statistics.

jaccard_perm_stats_df <- rbind(P1_perm_stats_df, P31M_perm_stats_df, P3M1_perm_stats_df, P6_perm_stats_df, P6M_perm_stats_df)

# Sort by group, exemplar_pair
jaccard_perm_stats_df <- jaccard_perm_stats_df %>%
  dplyr::arrange(Group, exemplar_pair)
jaccard_perm_stats_df %>%
  ggplot(.) +
  aes(x = jaccard_mean, fill = Group) +
  geom_histogram() +
  facet_grid(Group ~ .) + 
  ggtitle("Mean exemplar-pair Jaccard indices for permuted data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

jaccard_perm_stats_df %>%
  ggplot(.) +
  aes(x = jaccard_sd, fill = Group) +
  geom_histogram() +
  facet_grid(Group ~ .) +
  ggtitle("Standard deviation of exemplar-pair Jaccard indices for permuted data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare observed Jaccard to empirical distribution

Load the observed data and clean it.

jaccard_observed_df <-
  readr::read_csv("analysis/data/jaccard-no-duplicates.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Exemplar.Row = col_double(),
##   Exemplar.Col = col_double(),
##   Jaccard = col_double(),
##   Group = col_character()
## )
jaccard_observed_df <- jaccard_observed_df %>%
  dplyr::mutate(.,
                exemplar_pair = paste0(
                  stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
                  "-",
                  stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
                )) %>%
  dplyr::arrange(., Group, exemplar_pair)

Now, merge the permuted data with the observed data.

jaccard_merged_df <- dplyr::left_join(jaccard_perm_stats_df,
                                      jaccard_observed_df,
                                      by = c("Group", "exemplar_pair"))

# Rearrange columns for convenience
jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::select(., Group, exemplar_pair, Jaccard, jaccard_mean, jaccard_sd, Exemplar.Row, Exemplar.Col)

# Rename variables for clarity
jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::rename(., group = Group, jaccard_obs = Jaccard, 
                jaccard_emp_mean = jaccard_mean,
                jaccard_emp_sd = jaccard_sd,
                exemplar_row = Exemplar.Row,
                exemplar_col = Exemplar.Col)

Calculate empirical z as \(z_{emp}=J_{obs}-\mu_{J}\) for each exemplar pair.

jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::mutate(., z_emp = (jaccard_obs-jaccard_emp_mean)/jaccard_emp_sd,
                p_z_emp = pnorm(z_emp, jaccard_emp_mean, jaccard_emp_sd, lower.tail = FALSE))

Plot a histogram of z_emp.

jaccard_merged_df %>%
  ggplot(.) +
  aes(z_emp, fill = group) +
  geom_histogram() +
  facet_grid(group ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Curiously, P31M, P6, P6M and P3M1 have exemplar pairs whose observed Jaccard indices are substantially larger than the empirically derived reference (null) distribution even though the mean Jaccard indices for P1 are the largest.

Just for fun, let’s print the exemplar pairs whose z_emp exceed 3.7190165` (\(p<.0001\)).

jaccard_merged_df %>%
  dplyr::filter(., z_emp > 4) %>%
  dplyr::arrange(., group, desc(jaccard_emp_mean)) %>%
  knitr::kable(.)
group exemplar_pair jaccard_obs jaccard_emp_mean jaccard_emp_sd exemplar_row exemplar_col z_emp p_z_emp
P1 008-009 0.4347826 0.1997302 0.0523893 101008 101009 4.486650 0
P31M 006-016 0.3469388 0.1544913 0.0455907 115006 115016 4.221202 0
P31M 002-020 0.4347826 0.1541413 0.0445330 115002 115020 6.301875 0
P31M 004-009 0.4666667 0.1536301 0.0458028 115004 115009 6.834437 0
P31M 008-015 0.3469388 0.1534583 0.0454080 115008 115015 4.260934 0
P31M 014-020 0.5000000 0.1532609 0.0443117 115014 115020 7.824999 0
P31M 007-020 0.3469388 0.1530137 0.0472758 115007 115020 4.101998 0
P31M 002-007 0.6500000 0.1528091 0.0456546 115002 115007 10.890260 0
P31M 002-014 0.4347826 0.1523665 0.0464296 115002 115014 6.082672 0
P31M 007-014 0.3469388 0.1520057 0.0465605 115007 115014 4.186665 0
P31M 009-014 0.3469388 0.1501004 0.0442869 115009 115014 4.444622 0
P3M1 019-020 0.4042553 0.1436215 0.0475534 114019 114020 5.480869 0
P3M1 003-016 0.3469388 0.1431156 0.0447568 114003 114016 4.554010 0
P3M1 011-019 0.3750000 0.1415372 0.0445837 114011 114019 5.236508 0
P6 001-017 0.3200000 0.1384481 0.0436607 116001 116017 4.158245 0
P6 007-009 0.4666667 0.1380211 0.0441697 116007 116009 7.440517 0
P6 014-017 0.3200000 0.1376545 0.0451294 116014 116017 4.040509 0
P6 013-019 0.4888889 0.1373315 0.0433875 116013 116019 8.102725 0
P6 005-013 0.3333333 0.1365565 0.0442138 116005 116013 4.450579 0
P6 002-011 0.4042553 0.1363300 0.0431172 116002 116011 6.213881 0
P6 008-009 0.3541667 0.1361674 0.0434391 116008 116009 5.018501 0
P6 008-020 0.3265306 0.1359853 0.0425070 116008 116020 4.482680 0
P6 006-019 0.4666667 0.1356848 0.0441123 116006 116019 7.503171 0
P6 003-014 0.3265306 0.1354559 0.0438507 116003 116014 4.357394 0
P6 006-013 0.5581395 0.1348380 0.0450430 116006 116013 9.397711 0
P6M 008-010 0.3333333 0.1448769 0.0455945 117008 117010 4.133318 0
P6M 008-020 0.3541667 0.1437420 0.0410528 117008 117020 5.125709 0
P6M 001-011 0.3333333 0.1433422 0.0465964 117001 117011 4.077378 0
P6M 010-020 0.3829787 0.1432528 0.0453747 117010 117020 5.283256 0

Plot a histogram of p_z_emp or the probability associated with the empirical z.

jaccard_merged_df %>%
  ggplot(.) +
  aes(p_z_emp, fill = group) +
  geom_histogram() +
  facet_grid(group ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Heatmap visualizations

Create function to extract data and convert to matrix.

extract_exemplar_index <- function(e) {
  stringr::str_extract(e, '[0-9]{2}$')
}

extract_exemplar_emp_z_matrix <- function(wp_group = "P1", df = jaccard_merged_df) {
  this_df <- df %>%
    dplyr::filter(., group == wp_group)
  
  this_matrix <- matrix(nrow = 20, ncol = 20)
  
  for (r in 1:190) {
    this_matrix[as.numeric(extract_exemplar_index(this_df$exemplar_row[r])), 
                as.numeric(extract_exemplar_index(this_df$exemplar_col[r]))] <-
      this_df$z_emp[r]
  }
  
  this_matrix
}

wp_emp_mean <- function(wp_group = "P1", df = jaccard_merged_df) {
  this_df <- df %>%
    dplyr::filter(., group == wp_group)
  
  mean(this_df$jaccard_emp_mean)
}

wp_emp_sd <- function(wp_group = "P1", df = jaccard_merged_df) {
  this_df <- df %>%
    dplyr::filter(., group == wp_group)
  
  sd(this_df$jaccard_emp_mean)
}

Test the functions.

wp_emp_mean()
## [1] 0.2008955
wp_emp_sd()
## [1] 0.001735995
extract_exemplar_emp_z_matrix()
##       [,1]     [,2]       [,3]       [,4]       [,5]        [,6]        [,7]
##  [1,]   NA 1.278546 -0.4063873 -1.5587622  0.4846571  0.04668384 -0.86372090
##  [2,]   NA       NA -0.4582298 -0.4274744 -0.4186175  1.79035535 -0.84568146
##  [3,]   NA       NA         NA  0.8835449 -0.3725606 -0.40307115  1.26660435
##  [4,]   NA       NA         NA         NA  1.3404188  2.26643744 -0.04013945
##  [5,]   NA       NA         NA         NA         NA  0.47331811  0.88191730
##  [6,]   NA       NA         NA         NA         NA          NA  0.85560584
##  [7,]   NA       NA         NA         NA         NA          NA          NA
##  [8,]   NA       NA         NA         NA         NA          NA          NA
##  [9,]   NA       NA         NA         NA         NA          NA          NA
## [10,]   NA       NA         NA         NA         NA          NA          NA
## [11,]   NA       NA         NA         NA         NA          NA          NA
## [12,]   NA       NA         NA         NA         NA          NA          NA
## [13,]   NA       NA         NA         NA         NA          NA          NA
## [14,]   NA       NA         NA         NA         NA          NA          NA
## [15,]   NA       NA         NA         NA         NA          NA          NA
## [16,]   NA       NA         NA         NA         NA          NA          NA
## [17,]   NA       NA         NA         NA         NA          NA          NA
## [18,]   NA       NA         NA         NA         NA          NA          NA
## [19,]   NA       NA         NA         NA         NA          NA          NA
## [20,]   NA       NA         NA         NA         NA          NA          NA
##             [,8]        [,9]       [,10]       [,11]      [,12]       [,13]
##  [1,] -0.4170658  0.91998657  0.39258792  0.08265938 -0.8205435  0.04047028
##  [2,]  0.3820315  0.90692441 -0.88589284  2.43642158 -2.3758630 -1.29048260
##  [3,]  1.3493598  0.44475779 -0.42231484  1.48514030  1.3501212 -1.24717244
##  [4,]  1.2670747  0.91684340 -0.06783505  1.95939390 -0.0363671 -0.48506776
##  [5,] -0.8272640 -0.01028528 -0.39954466  0.53513838  0.9527189 -0.03654542
##  [6,] -0.8246242 -1.18513659 -1.55311620  0.52395716 -2.3053761  0.02056386
##  [7,] -1.9834361 -0.81164814  1.72676587 -1.52011445  2.1843667 -1.19457034
##  [8,]         NA  4.48664976 -1.57559245  0.96491370 -0.3760906 -0.82638977
##  [9,]         NA          NA  0.46071891  1.01567374 -0.8051875 -0.44645379
## [10,]         NA          NA          NA -0.72392426  0.4382522 -1.23232410
## [11,]         NA          NA          NA          NA -1.9134128  0.47737007
## [12,]         NA          NA          NA          NA         NA  1.26378565
## [13,]         NA          NA          NA          NA         NA          NA
## [14,]         NA          NA          NA          NA         NA          NA
## [15,]         NA          NA          NA          NA         NA          NA
## [16,]         NA          NA          NA          NA         NA          NA
## [17,]         NA          NA          NA          NA         NA          NA
## [18,]         NA          NA          NA          NA         NA          NA
## [19,]         NA          NA          NA          NA         NA          NA
## [20,]         NA          NA          NA          NA         NA          NA
##               [,14]       [,15]       [,16]        [,17]       [,18]
##  [1,] -8.450266e-01  2.25346232  0.46654942  1.266958620 -0.45569165
##  [2,] -7.454175e-02 -0.42818102 -0.03996971 -0.843992742 -0.45993890
##  [3,] -2.353633e-02  2.17214667 -0.40745546  1.805807998  0.01955637
##  [4,] -8.772070e-01 -0.84589851 -1.28706043 -0.005957962 -0.04586250
##  [5,] -1.090300e-02  1.24828571  0.86896324 -1.551022672  0.94420009
##  [6,] -1.185326e+00 -0.06913131 -1.16833936 -0.444435357  1.23212863
##  [7,] -4.587557e-01  2.28144774  1.77045809  1.769055737  0.36569635
##  [8,] -5.251895e-05 -2.00041056 -1.86882703 -0.448893957  0.41133069
##  [9,]  5.565490e-03 -0.40475349 -1.16208327  1.309903275  0.03547131
## [10,] -4.538015e-01  2.74519701  3.88341611  0.407380715 -2.30123847
## [11,]  4.206246e-02 -0.35592516 -1.50125876  0.472141256 -0.35533988
## [12,]  8.320571e-01  1.28992131 -0.02461837 -0.836023062 -1.22388245
## [13,]  4.032674e-01 -0.41612144 -1.20419690 -1.183330632  1.77401400
## [14,]            NA -0.83358345 -0.05385836 -0.841892714 -0.43985972
## [15,]            NA          NA  1.80969188  0.894490657 -0.47778554
## [16,]            NA          NA          NA  2.282295285 -2.38702604
## [17,]            NA          NA          NA           NA -2.23676330
## [18,]            NA          NA          NA           NA          NA
## [19,]            NA          NA          NA           NA          NA
## [20,]            NA          NA          NA           NA          NA
##              [,19]       [,20]
##  [1,]  0.434817431 -0.85052280
##  [2,] -0.407690961 -0.47929098
##  [3,] -0.387107712 -0.84816357
##  [4,] -0.806827174  0.35969809
##  [5,] -0.773074742 -0.37698353
##  [6,] -0.437145373 -0.37783743
##  [7,]  3.476615059  0.78358220
##  [8,] -1.289939900 -1.92388138
##  [9,] -0.405258726 -1.53542408
## [10,]  1.320221281  2.86840721
## [11,] -2.295820616 -0.36815069
## [12,]  1.870930810  0.82816252
## [13,] -0.421732928 -0.44874426
## [14,]  1.277881102 -0.05811713
## [15,]  0.862228210 -0.43978299
## [16,] -0.006930371  3.36936846
## [17,] -0.002987090  1.32621659
## [18,] -1.854978934 -2.01838497
## [19,]           NA  2.29590443
## [20,]           NA          NA

Now, let’s turn to the heatmap.

# Based on stats::heatmap(), but with different defaults for axis labels
my_heatmap <- function (x, Rowv = NULL, Colv = if (symm) "Rowv" else NULL, 
    distfun = dist, hclustfun = hclust, reorderfun = function(d, 
        w) reorder(d, w), add.expr, symm = FALSE, revC = identical(Colv, 
        "Rowv"), scale = c("row", "column", "none"), na.rm = TRUE, 
    margins = c(5, 5), ColSideColors, RowSideColors, cexRow = 0.2 + 
        1/log10(nr), cexCol = 0.2 + 1/log10(nc), labRow = NULL, 
    labCol = NULL, main = NULL, xlab = NULL, ylab = NULL, 
    xside = 2, yside = 3, # 1=bottom, 2=left, 3=top, 4=right
    keep.dendro = FALSE, 
    verbose = getOption("verbose"), ...) 
{
    scale <- if (symm && missing(scale)) 
        "none"
    else match.arg(scale)
    if (length(di <- dim(x)) != 2 || !is.numeric(x)) 
        stop("'x' must be a numeric matrix")
    nr <- di[1L]
    nc <- di[2L]
    if (nr <= 1 || nc <= 1) 
        stop("'x' must have at least 2 rows and 2 columns")
    if (!is.numeric(margins) || length(margins) != 2L) 
        stop("'margins' must be a numeric vector of length 2")
    doRdend <- !identical(Rowv, NA)
    doCdend <- !identical(Colv, NA)
    if (!doRdend && identical(Colv, "Rowv")) 
        doCdend <- FALSE
    if (is.null(Rowv)) 
        Rowv <- rowMeans(x, na.rm = na.rm)
    if (is.null(Colv)) 
        Colv <- colMeans(x, na.rm = na.rm)
    if (doRdend) {
        if (inherits(Rowv, "dendrogram")) 
            ddr <- Rowv
        else {
            hcr <- hclustfun(distfun(x))
            ddr <- as.dendrogram(hcr)
            if (!is.logical(Rowv) || Rowv) 
                ddr <- reorderfun(ddr, Rowv)
        }
        if (nr != length(rowInd <- order.dendrogram(ddr))) 
            stop("row dendrogram ordering gave index of wrong length")
    }
    else rowInd <- 1L:nr
    if (doCdend) {
        if (inherits(Colv, "dendrogram")) 
            ddc <- Colv
        else if (identical(Colv, "Rowv")) {
            if (nr != nc) 
                stop("Colv = \"Rowv\" but nrow(x) != ncol(x)")
            ddc <- ddr
        }
        else {
            hcc <- hclustfun(distfun(if (symm) 
                x
            else t(x)))
            ddc <- as.dendrogram(hcc)
            if (!is.logical(Colv) || Colv) 
                ddc <- reorderfun(ddc, Colv)
        }
        if (nc != length(colInd <- order.dendrogram(ddc))) 
            stop("column dendrogram ordering gave index of wrong length")
    }
    else colInd <- 1L:nc
    x <- x[rowInd, colInd]
    labRow <- labRow[rowInd] %||% rownames(x) %||% (1L:nr)[rowInd]
    labCol <- labCol[colInd] %||% colnames(x) %||% (1L:nc)[colInd]
    if (scale == "row") {
        x <- sweep(x, 1L, rowMeans(x, na.rm = na.rm), check.margin = FALSE)
        sx <- apply(x, 1L, sd, na.rm = na.rm)
        x <- sweep(x, 1L, sx, "/", check.margin = FALSE)
    }
    else if (scale == "column") {
        x <- sweep(x, 2L, colMeans(x, na.rm = na.rm), check.margin = FALSE)
        sx <- apply(x, 2L, sd, na.rm = na.rm)
        x <- sweep(x, 2L, sx, "/", check.margin = FALSE)
    }
    lmat <- rbind(c(NA, 3), 2:1)
    lwid <- c(if (doRdend) 1 else 0.05, 4)
    lhei <- c((if (doCdend) 1 else 0.05) + if (!is.null(main)) 0.2 else 0, 
        4)
    if (!missing(ColSideColors)) {
        if (!is.character(ColSideColors) || length(ColSideColors) != 
            nc) 
            stop("'ColSideColors' must be a character vector of length ncol(x)")
        lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1)
        lhei <- c(lhei[1L], 0.2, lhei[2L])
    }
    if (!missing(RowSideColors)) {
        if (!is.character(RowSideColors) || length(RowSideColors) != 
            nr) 
            stop("'RowSideColors' must be a character vector of length nrow(x)")
        lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), 
            1), lmat[, 2] + 1)
        lwid <- c(lwid[1L], 0.2, lwid[2L])
    }
    lmat[is.na(lmat)] <- 0
    if (verbose) {
        cat("layout: widths = ", lwid, ", heights = ", lhei, 
            "; lmat=\n")
        print(lmat)
    }
    dev.hold()
    on.exit(dev.flush())
    op <- par(no.readonly = TRUE)
    on.exit(par(op), add = TRUE)
    layout(lmat, widths = lwid, heights = lhei, respect = TRUE)
    if (!missing(RowSideColors)) {
        par(mar = c(margins[1L], 0, 0, 0.5))
        image(rbind(if (revC) 
            nr:1L
        else 1L:nr), col = RowSideColors[rowInd], axes = FALSE)
    }
    if (!missing(ColSideColors)) {
        par(mar = c(0.5, 0, 0, margins[2L]))
        image(cbind(1L:nc), col = ColSideColors[colInd], axes = FALSE)
    }
    par(mar = c(margins[1L], 0, 0, margins[2L]))
    if (!symm || scale != "none") 
        x <- t(x)
    if (revC) {
        iy <- nr:1
        if (doRdend) 
            ddr <- rev(ddr)
        x <- x[, iy]
    }
    else iy <- 1L:nr
    image(1L:nc, 1L:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + 
        c(0, nr), axes = FALSE, xlab = "", ylab = "", ...)
    axis(xside, 1L:nc, labels = labCol, las = 2, line = -0.5, tick = 0, 
        cex.axis = cexCol)
    if (!is.null(xlab)) 
        mtext(xlab, side = xside, line = margins[1L] - 1.25)
    axis(yside, iy, labels = labRow, las = 2, line = -0.5, tick = 0, 
        cex.axis = cexRow)
    if (!is.null(ylab)) 
        mtext(ylab, side = yside, line = margins[2L] - 1.25)
    if (!missing(add.expr)) 
        eval.parent(substitute(add.expr))
    par(mar = c(margins[1L], 0, 0, 0))
    if (doRdend) 
        plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none")
    else frame()
    par(mar = c(0, 0, if (!is.null(main)) 1 else 0, margins[2L]))
    if (doCdend) 
        plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none")
    else if (!is.null(main)) 
        frame()
    if (!is.null(main)) {
        par(xpd = NA)
        title(main, cex.main = 1.5 * op[["cex.main"]])
    }
    invisible(list(rowInd = rowInd, colInd = colInd, Rowv = if (keep.dendro && 
        doRdend) ddr, Colv = if (keep.dendro && doCdend) ddc))
}

Combine the above into a new plotting function.

value_breaks <- c(0, .0001, .001, .01, .1, 1)
color_palate <- RColorBrewer::brewer.pal(length(value_breaks)-1, "Oranges")
rev_color_palate <- color_palate[length(color_palate):1]
value_colors <-
    colorRampPalette(color_palate)(length(rev_color_palate))
legend_text <- c("<.0001", "<.001", "<.01", ".1", ">.1")
plot_heatmap_p <- function(wp_group = "P1",
                         df = jaccard_merged_df,
                         show_legend = TRUE) {
  # Select wp_group

  j_matrix <- extract_exemplar_emp_z_matrix(wp_group)
  
  my_heatmap(
    j_matrix,
    Rowv = NA,
    Colv = NA,
    symm = TRUE,
    margins = c(30,30),
    col = value_colors,
    breaks = value_breaks,
    cexRow = 3.0,
    cexCol = 3.0
    )
  
  if (show_legend) {
    legend(x = "bottomright",
           legend = legend_text,
           fill = value_colors)
  }
}

Test the heatmap plotting function.

plot_heatmap_p("P1")

plot_heatmap_p("P31M")

plot_heatmap_p("P3M1")

plot_heatmap_p("P6")

plot_heatmap_p("P6M")

Why these graphs are so different from those in exemplar-jaccard-plots.Rmd has me stumped. I’m especially confused about why the top and side margins don’t seem to behave as expeted.

I’m going to try a different approach to summarizing these. Let’s add a categorical variable to indicate \(p\) value of the empirical z.

jaccard_merged_df <- jaccard_merged_df %>%
  dplyr::mutate(.,
                p_z_cuts =
                  cut(
                    p_z_emp,
                    c(0, .0001, .001, .01, .1, 1),
                    labels = c("<.0001", "<.001", "<.01", "<.1", ">.1"),
                    include.lowest = TRUE
                  ))
jaccard_merged_df %>%
  ggplot(.) +
  aes(p_z_cuts, fill = group) +
  geom_bar() +
  facet_grid(group ~ .)

Or in tabular form:

xtabs(~ p_z_cuts + group, jaccard_merged_df)
##         group
## p_z_cuts  P1 P31M P3M1  P6 P6M
##   <.0001  69   68   73  68  77
##   <.001    3    0    6   0   1
##   <.01     1    0    2   0   1
##   <.1      0    0    0   0   0
##   >.1    117  122  109 122 111